In [1]:
import xgboost as xgb
from sklearn import model_selection
from sklearn import metrics
import numpy as np
import pandas as pd
import os
import pickle
import itertools
import random
import math
import time
from datetime import timedelta
%matplotlib inline
np.set_printoptions(precision=2, suppress=True)
In [2]:
DATA_DIRECTORY = "../data/topviewkinect/chi"
DATA_INFO = ""
In [3]:
all_features_csv = "{data_dir}/{data}features.csv".format(data_dir=DATA_DIRECTORY, data=DATA_INFO)
all_labels_csv = "{data_dir}/{data}labels.csv".format(data_dir=DATA_DIRECTORY, data=DATA_INFO)
all_features_df = pd.read_csv(all_features_csv)
all_labels_df = pd.read_csv(all_labels_csv)
In [4]:
all_features_df.shape, all_labels_df.shape
Out[4]:
In [5]:
all_features_df.head()
Out[5]:
In [6]:
all_labels_df.head()
Out[6]:
In [7]:
subjects_list = np.unique(all_labels_df["subject"])
subjects_list
Out[7]:
In [8]:
activities_list = np.unique(all_labels_df["activity"])
activities_list
Out[8]:
In [9]:
activity_samples_df = pd.DataFrame(0, index=np.arange(6), columns=np.arange(12))
for subject_id in range(1, 13):
    d = all_labels_df[all_labels_df["subject"] == subject_id]
    for activity_id in range(6):
        # .iloc with (row, col) instead of deprecated chained .ix indexing,
        # which may fail to write through to the frame
        activity_samples_df.iloc[activity_id, subject_id-1] = len(d[d["activity"] == activity_id])
activity_samples_df
Out[9]:
In [10]:
median_activity_samples = [int(np.median(activity_samples_df.iloc[i])) for i in range(6)]
median_activity_samples
Out[10]:
In [11]:
def sample_data(data, n, random_state):
    """Return exactly n rows: bootstrap up if too few, stride-sample down if too many."""
    data_size = len(data)
    if data_size <= n:
        # Too few rows: oversample with replacement up to n.
        return data.sample(n=n, replace=True, random_state=random_state)
    # Enough rows: take every stride-th row, wrapping around the end, until n are chosen.
    stride = math.ceil(data_size / n)
    positions = []
    i = 0
    while len(positions) < n:
        positions.append(i)
        i += stride
        if i >= data_size:
            i -= data_size  # wrap around to the start
    return data.iloc[positions].reset_index(drop=True)
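A quick sanity check of the sampler on a toy frame (the column name "a" is made up for illustration): downsampling takes evenly strided rows, oversampling bootstraps with replacement.
toy = pd.DataFrame({"a": range(10)})
# 10 rows down to 4: strided positions 0, 3, 6, 9
print(sample_data(toy, n=4, random_state=42)["a"].tolist())
# 3 rows up to 4: sampled with replacement, so duplicates are expected
print(sample_data(toy.head(3), n=4, random_state=42)["a"].tolist())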
In [12]:
X_frames, y_frames = [], []
for subject_id in subjects_list:
    print("Subject", subject_id)
    subject_df = all_labels_df[all_labels_df["subject"] == subject_id]
    for activity_id in activities_list:
        subject_activity_df = subject_df[subject_df["activity"] == activity_id]
        data_indices = subject_activity_df.index
        subject_activity_y = subject_activity_df["activity"].values
        subject_activity_X = all_features_df.iloc[data_indices]
        subject_activity_X = subject_activity_X.assign(activity=subject_activity_y)
        # Resample each subject-activity block to the activity's median sample count
        num_samples = median_activity_samples[activity_id]
        subject_activity_X = sample_data(data=subject_activity_X, n=num_samples, random_state=42)
        subject_activity_y = pd.DataFrame({
            "subject": subject_activity_X["subject"],
            "activity": subject_activity_X["activity"]
        })
        subject_activity_X = subject_activity_X.drop(labels="activity", axis=1)
        X_frames.append(subject_activity_X)
        y_frames.append(subject_activity_y)
X_df = pd.concat(X_frames, ignore_index=True)
y_df = pd.concat(y_frames, ignore_index=True)
In [13]:
X_df.head()
Out[13]:
In [14]:
y_df.head()
Out[14]:
In [16]:
X_df.shape, y_df.shape
Out[16]:
In [17]:
subject_train_indices = [1, 3, 5, 7, 9, 11]
subject_test_indices = [2, 4, 6, 8, 10, 12]
In [18]:
X_train_df = X_df[X_df["subject"].isin(subject_train_indices)].reset_index(drop=True)
y_train_df = y_df[y_df["subject"].isin(subject_train_indices)].reset_index(drop=True)
X_test_df = X_df[X_df["subject"].isin(subject_test_indices)].reset_index(drop=True)
y_test_df = y_df[y_df["subject"].isin(subject_test_indices)].reset_index(drop=True)
init_X_all = all_features_df.drop(labels="subject", axis=1).values
init_y_all = all_labels_df["activity"].values
X_train = X_train_df.drop(labels="subject", axis=1).values
y_train = y_train_df["activity"].values
X_test = X_test_df.drop(labels="subject", axis=1).values
y_test = y_test_df["activity"].values
X_all = np.concatenate([X_train, X_test])
y_all = np.concatenate([y_train, y_test])
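The odd/even subject split keeps every test subject completely unseen during training, i.e. a cross-subject evaluation. A minimal sanity check that the two groups are disjoint:
# Sanity check (uses the frames built above): no subject on both sides of the split.
assert not set(subject_train_indices) & set(subject_test_indices)
print(np.unique(y_train_df["subject"]), np.unique(y_test_df["subject"]))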
In [19]:
init_X_all.shape, init_y_all.shape
Out[19]:
In [20]:
X_train.shape, y_train.shape
Out[20]:
In [21]:
X_test.shape, y_test.shape
Out[21]:
In [22]:
X_all.shape, y_all.shape
Out[22]:
In [23]:
init_all_dmatrix = xgb.DMatrix(init_X_all, init_y_all)
train_dmatrix = xgb.DMatrix(X_train, y_train)
test_dmatrix = xgb.DMatrix(X_test, y_test)
all_dmatrix = xgb.DMatrix(X_all, y_all)
In [23]:
cv = []
for subject_id in subject_train_indices:
    train_indices = y_train_df[y_train_df["subject"] != subject_id].index.tolist()
    validation_indices = y_train_df[y_train_df["subject"] == subject_id].index.tolist()
    cv.append((train_indices, validation_indices))
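Each (train_indices, validation_indices) pair holds out one training subject, so cv implements leave-one-subject-out validation over the six training subjects. A minimal check that each fold partitions the training rows:
# Sanity check: folds are disjoint and together cover all training rows.
for train_idx, val_idx in cv:
    assert not set(train_idx) & set(val_idx)
    assert len(train_idx) + len(val_idx) == len(y_train_df)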
In [24]:
def cv_tune_num_boost_round(X, y, cv, params, num_boost_round, early_stopping, verbose):
    start = time.time()
    cv_errors = list()
    for cv_idx, (train_indices, validation_indices) in enumerate(cv):
        print("| CV:", cv_idx)
        cv_X_train = np.take(X, train_indices, axis=0)
        cv_y_train = np.take(y, train_indices, axis=0)
        cv_train_dmatrix = xgb.DMatrix(cv_X_train, cv_y_train)
        cv_X_validation = np.take(X, validation_indices, axis=0)
        cv_y_validation = np.take(y, validation_indices, axis=0)
        cv_validation_dmatrix = xgb.DMatrix(cv_X_validation, cv_y_validation)
        watchlist = [(cv_train_dmatrix, "train"), (cv_validation_dmatrix, "eval")]
        cv_result = {}
        model = xgb.train(params=params, dtrain=cv_train_dmatrix, evals=watchlist, evals_result=cv_result,
                          num_boost_round=num_boost_round, verbose_eval=verbose)
        cv_errors.append(cv_result)
        print("\n| Elapsed: {elapsed}\n".format(elapsed=timedelta(seconds=(time.time() - start))))
    # Median validation merror across folds, per boosting round
    validation_errors = [errors["eval"]["merror"] for errors in cv_errors]
    validation_avg_errors = [np.median(errors) for errors in zip(*validation_errors)]
    # Early stopping over the aggregated curve: stop once a window of `early_stopping`
    # rounds fails to improve on the window's first error; return the best round (1-based).
    for i in range(early_stopping - 1, num_boost_round):
        validation_range_start = i - (early_stopping - 1)
        validation_range_errors = validation_avg_errors[validation_range_start:i+1]
        if validation_range_errors[0] <= np.min(validation_range_errors):
            return cv_errors, np.min(validation_avg_errors[:i+1]), np.argmin(validation_avg_errors[:i+1]) + 1
    return cv_errors, np.min(validation_avg_errors), np.argmin(validation_avg_errors) + 1
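The scan at the end applies an early-stopping rule to the median validation curve: once a window of early_stopping rounds fails to improve on the window's first error, the search stops and the best round so far is returned (1-based, so it can be reused directly as n_estimators). A toy illustration of the same windowed rule, with made-up errors:
# Errors improve until round 4, then plateau; with a window of 3 the scan
# stops at round 6 and reports round 4 as best.
errors = [0.30, 0.25, 0.20, 0.18, 0.19, 0.19, 0.20]
window = 3
for i in range(window - 1, len(errors)):
    if errors[i - (window - 1)] <= min(errors[i - (window - 1):i + 1]):
        print("stopped at round", i + 1, "- best round:", int(np.argmin(errors[:i + 1])) + 1)
        break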
In [25]:
def cv_tune_tree_booster(X, y, params, cv_params, cv, n_jobs=-1, verbose=2):
    # GridSearchCV accepts the explicit (train, validation) index pairs built above,
    # so the grid search respects the subject-wise folds.
    cv_model = model_selection.GridSearchCV(
        xgb.XGBClassifier(**params), param_grid=cv_params, cv=cv, n_jobs=n_jobs, verbose=verbose)
    cv_model.fit(X, y)
    return cv_model
In [26]:
params = {
    "learning_rate": 0.05,
    "n_estimators": 100,
    "max_depth": 5,
    "min_child_weight": 1,
    "max_delta_step": 1,
    "gamma": 0.5,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": 6,
    "silent": 0,
    "seed": 42
}
In [27]:
print("----------------------------------")
params
Out[27]:
In [28]:
cv_errors, min_error, n_estimators = cv_tune_num_boost_round(
    X=X_train, y=y_train, cv=cv, params=params, num_boost_round=500, early_stopping=50, verbose=50)
In [29]:
print("----------------------------------")
print("n_estimators =", n_estimators)
print("min_error =", min_error)
In [30]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": 5,
    "min_child_weight": 1,
    "max_delta_step": 1,
    "gamma": 0.5,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": 6,
    "silent": 0,
    "seed": 42
}
In [31]:
print("----------------------------------")
params
Out[31]:
In [32]:
booster = xgb.train(params=params, dtrain=train_dmatrix, num_boost_round=params["n_estimators"])
In [33]:
y_predicted = booster.predict(test_dmatrix)
In [34]:
accuracy = metrics.accuracy_score(y_test, y_predicted)
cm = metrics.confusion_matrix(y_test, y_predicted)
cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
cm *= 100
print(accuracy)
print(cm)
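Since %matplotlib inline is already active, the row-normalized matrix (rows are true activities, columns predicted activities, values in percent) can also be shown as a heatmap; a minimal sketch:
import matplotlib.pyplot as plt

# Heatmap of the row-normalized confusion matrix (percentages).
fig, ax = plt.subplots()
im = ax.imshow(cm, cmap="Blues", vmin=0, vmax=100)
fig.colorbar(im, ax=ax, label="% of true class")
ax.set_xlabel("Predicted activity")
ax.set_ylabel("True activity")
for r in range(cm.shape[0]):
    for c in range(cm.shape[1]):
        ax.text(c, r, "{:.0f}".format(cm[r, c]), ha="center", va="center")
plt.show()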
In [35]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": 5,
    "min_child_weight": 1,
    "max_delta_step": 1,
    "gamma": 0.5,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "silent": 0,
    "seed": 42
}
cv_params = {
    "max_depth": list(range(3, 10, 2)),
    "min_child_weight": list(range(1, 10, 2))
}
In [36]:
print("----------------------------------")
params
cv_params
Out[36]:
In [37]:
cv_model = cv_tune_tree_booster(
    X=X_train, y=y_train, params=params, cv_params=cv_params, cv=cv)
max_depth, min_child_weight = cv_model.best_params_["max_depth"], cv_model.best_params_["min_child_weight"]
In [38]:
print("----------------------------------")
cv_results = cv_model.cv_results_
for param_idx, param in enumerate(cv_results["params"]):
    print("{param} - mean: {mean:.6f}, std: {std:.6f}".format(
        param=param, mean=cv_results["mean_test_score"][param_idx], std=cv_results["std_test_score"][param_idx]))
print("----------------------------------")
print("max_depth =", max_depth)
print("min_child_weight =", min_child_weight)
print("best score =", cv_model.best_score_)
In [39]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": 5,
    "min_child_weight": 1,
    "max_delta_step": 1,
    "gamma": 0.5,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "silent": 0,
    "seed": 42
}
cv_params = {
    "max_depth": [max_depth-1, max_depth, max_depth+1],
    "min_child_weight": [min_child_weight-1, min_child_weight, min_child_weight+1]
}
In [40]:
print("----------------------------------")
params
cv_params
Out[40]:
In [41]:
cv_model = cv_tune_tree_booster(
    X=X_train, y=y_train, params=params, cv_params=cv_params, cv=cv)
max_depth, min_child_weight = cv_model.best_params_["max_depth"], cv_model.best_params_["min_child_weight"]
In [42]:
print("----------------------------------")
cv_results = cv_model.cv_results_
for param_idx, param in enumerate(cv_results["params"]):
    print("{param} - mean: {mean:.6f}, std: {std:.6f}".format(
        param=param, mean=cv_results["mean_test_score"][param_idx], std=cv_results["std_test_score"][param_idx]))
print("----------------------------------")
print("max_depth =", max_depth)
print("min_child_weight =", min_child_weight)
print("best score =", cv_model.best_score_)
In [43]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "max_delta_step": 1,
    "gamma": 0.5,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": 6,
    "silent": 0,
    "seed": 42
}
In [44]:
print("----------------------------------")
params
Out[44]:
In [45]:
cv_errors, min_error, n_estimators = cv_tune_num_boost_round(
    X=X_train, y=y_train, cv=cv, params=params, num_boost_round=500, early_stopping=50, verbose=50)
In [46]:
print("----------------------------------")
print("n_estimators =", n_estimators)
print("min_error =", min_error)
In [47]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "max_delta_step": 1,
    "gamma": 0.5,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": 6,
    "silent": 0,
    "seed": 42
}
In [48]:
print("----------------------------------")
params
Out[48]:
In [49]:
booster = xgb.train(params=params, dtrain=train_dmatrix, num_boost_round=params["n_estimators"])
In [50]:
y_predicted = booster.predict(test_dmatrix)
In [51]:
accuracy = metrics.accuracy_score(y_test, y_predicted)
cm = metrics.confusion_matrix(y_test, y_predicted)
cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
cm *= 100
print(accuracy)
print(cm)
In [52]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "max_delta_step": 1,
    "gamma": 0.5,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "silent": 0,
    "seed": 42
}
cv_params = {
    "gamma": [i/10.0 for i in range(0, 11)]
}
In [53]:
print("----------------------------------")
params
cv_params
Out[53]:
In [54]:
cv_model = cv_tune_tree_booster(
    X=X_train, y=y_train, params=params, cv_params=cv_params, cv=cv)
gamma = cv_model.best_params_["gamma"]
In [55]:
print("----------------------------------")
cv_results = cv_model.cv_results_
for param_idx, param in enumerate(cv_results["params"]):
    print("{param} - mean: {mean:.6f}, std: {std:.6f}".format(
        param=param, mean=cv_results["mean_test_score"][param_idx], std=cv_results["std_test_score"][param_idx]))
print("----------------------------------")
print("gamma =", gamma)
In [56]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "max_delta_step": 1,
    "gamma": gamma,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": 6,
    "silent": 0,
    "seed": 42
}
In [59]:
print("----------------------------------")
params
Out[59]:
In [61]:
cv_errors, min_error, n_estimators = cv_tune_num_boost_round(
    X=X_train, y=y_train, cv=cv, params=params, num_boost_round=500, early_stopping=50, verbose=50)
In [62]:
print("----------------------------------")
print("n_estimators =", n_estimators)
print("min_error =", min_error)
In [63]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "max_delta_step": 1,
    "gamma": gamma,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": 6,
    "silent": 0,
    "seed": 42
}
In [65]:
booster = xgb.train(params=params, dtrain=train_dmatrix, num_boost_round=params["n_estimators"])
In [70]:
y_predicted = booster.predict(test_dmatrix)
In [71]:
accuracy = metrics.accuracy_score(y_test, y_predicted)
cm = metrics.confusion_matrix(y_test, y_predicted)
cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
cm *= 100
print(accuracy)
print(cm)
In [72]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "max_delta_step": 1,
    "gamma": gamma,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "silent": 0,
    "seed": 42
}
cv_params = {
    "subsample": [i/10.0 for i in range(5, 11)]
}
In [73]:
print("----------------------------------")
params
cv_params
Out[73]:
In [74]:
cv_model = cv_tune_tree_booster(
    X=X_train, y=y_train, params=params, cv_params=cv_params, cv=cv)
subsample = cv_model.best_params_["subsample"]
In [75]:
print("----------------------------------")
cv_results = cv_model.cv_results_
for param_idx, param in enumerate(cv_results["params"]):
    print("{param} - mean: {mean:.6f}, std: {std:.6f}".format(
        param=param, mean=cv_results["mean_test_score"][param_idx], std=cv_results["std_test_score"][param_idx]))
print("----------------------------------")
print("subsample =", subsample)
In [60]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "max_delta_step": 1,
    "gamma": gamma,
    "subsample": subsample,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": 6,
    "silent": 0,
    "seed": 42
}
In [61]:
print("----------------------------------")
params
Out[61]:
In [62]:
cv_errors, min_error, n_estimators = cv_tune_num_boost_round(
    X=X_train, y=y_train, cv=cv, params=params, num_boost_round=500, early_stopping=50, verbose=50)
In [64]:
print("----------------------------------")
print("n_estimators =", n_estimators)
print("min_error =", min_error)
In [29]:
params = {
    "learning_rate": 0.05,
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "min_child_weight": min_child_weight,
    "max_delta_step": 1,
    "gamma": gamma,
    "subsample": subsample,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "reg_lambda": 1,
    "reg_alpha": 0,
    "scale_pos_weight": 1,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": 6,
    "silent": 0,
    "seed": 42
}
In [30]:
params
Out[30]:
In [31]:
booster_train = xgb.train(params=params, dtrain=train_dmatrix, num_boost_round=params["n_estimators"])
In [32]:
y_predicted = booster_train.predict(test_dmatrix)
In [33]:
accuracy = metrics.accuracy_score(y_test, y_predicted)
cm = metrics.confusion_matrix(y_test, y_predicted)
cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
cm *= 100
print(accuracy)
print(cm)
In [36]:
booster_train.save_model("initial_cs_train.model")
In [35]:
booster_all = xgb.train(params=params, dtrain=all_dmatrix, num_boost_round=params["n_estimators"])
In [37]:
booster_all.save_model("initial_cs_all.model")
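The saved .model files can be reloaded later without retraining; a minimal sketch that restores the train-only booster and reproduces the test accuracy printed above:
# Restore a saved booster and reuse it for prediction.
restored = xgb.Booster()
restored.load_model("initial_cs_train.model")
y_check = restored.predict(test_dmatrix)
print(metrics.accuracy_score(y_test, y_check))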